{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "b5994ad4",
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:00.546001Z",
     "iopub.status.busy": "2021-09-19T19:53:00.545095Z",
     "iopub.status.idle": "2021-09-19T19:53:01.555192Z",
     "shell.execute_reply": "2021-09-19T19:53:01.556144Z",
     "shell.execute_reply.started": "2021-09-19T19:36:55.776083Z"
    },
    "papermill": {
     "duration": 1.052157,
     "end_time": "2021-09-19T19:53:01.556613",
     "exception": false,
     "start_time": "2021-09-19T19:53:00.504456",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from sklearn.datasets import make_classification\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Synthetic binary classification data: 50 features in total, split into\n",
    "# 10 informative, 25 redundant and 15 repeated ones.\n",
    "X, y = make_classification(\n",
    "    n_samples=5000,\n",
    "    n_features=50,\n",
    "    n_informative=10,\n",
    "    n_redundant=25,\n",
    "    n_repeated=15,\n",
    "    n_clusters_per_class=5,\n",
    "    flip_y=0.05,\n",
    "    class_sep=0.5,\n",
    "    random_state=0,\n",
    ")\n",
    "\n",
    "# Keep 33% of the rows aside as the final test set.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.33, random_state=0\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "fd75371e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:01.651956Z",
     "iopub.status.busy": "2021-09-19T19:53:01.650857Z",
     "iopub.status.idle": "2021-09-19T19:53:01.655983Z",
     "shell.execute_reply": "2021-09-19T19:53:01.656880Z",
     "shell.execute_reply.started": "2021-09-19T19:36:55.805534Z"
    },
    "papermill": {
     "duration": 0.058079,
     "end_time": "2021-09-19T19:53:01.657143",
     "exception": false,
     "start_time": "2021-09-19T19:53:01.599064",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"\\n# As an alternative to the make_classification synthetic data,\\n# you may decide to use the Madelon dataset by using the code\\n# in this commented cell\\n\\nfrom sklearn.datasets import fetch_openml\\nfrom sklearn.model_selection import train_test_split\\ndata = fetch_openml(name='madelon')\\nX = data.data.astype(float)\\nX = X / X.mean()\\ny = (data.target=='2').astype(float)\\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)\\n\""
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# As an alternative to the make_classification synthetic data,\n",
    "# you may decide to use the Madelon dataset by uncommenting the\n",
    "# code in this cell.\n",
    "#\n",
    "# (Written as comments rather than a bare triple-quoted string, so running\n",
    "# the cell does not echo the whole snippet back as its output.)\n",
    "#\n",
    "# from sklearn.datasets import fetch_openml\n",
    "# from sklearn.model_selection import train_test_split\n",
    "# data = fetch_openml(name='madelon')\n",
    "# X = data.data.astype(float)\n",
    "# X = X / X.mean()\n",
    "# y = (data.target=='2').astype(float)\n",
    "# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "d972036a",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:01.710840Z",
     "iopub.status.busy": "2021-09-19T19:53:01.710166Z",
     "iopub.status.idle": "2021-09-19T19:53:01.886501Z",
     "shell.execute_reply": "2021-09-19T19:53:01.885970Z",
     "shell.execute_reply.started": "2021-09-19T19:36:55.814685Z"
    },
    "papermill": {
     "duration": 0.203251,
     "end_time": "2021-09-19T19:53:01.886647",
     "exception": false,
     "start_time": "2021-09-19T19:53:01.683396",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score, log_loss, roc_auc_score\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "# Three base learners from different model families.\n",
    "# SVC is created with probability=True because predict_proba is called on it later.\n",
    "model_1 = SVC(random_state=0, probability=True)\n",
    "model_2 = RandomForestClassifier(random_state=0)\n",
    "model_3 = KNeighborsClassifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "b4d7a61e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:01.947741Z",
     "iopub.status.busy": "2021-09-19T19:53:01.946910Z",
     "iopub.status.idle": "2021-09-19T19:53:08.844771Z",
     "shell.execute_reply": "2021-09-19T19:53:08.844108Z",
     "shell.execute_reply.started": "2021-09-19T19:36:55.828384Z"
    },
    "papermill": {
     "duration": 6.933691,
     "end_time": "2021-09-19T19:53:08.844919",
     "exception": false,
     "start_time": "2021-09-19T19:53:01.911228",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "KNeighborsClassifier()"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Train every base learner on the full training split.\n",
    "for base_learner in (model_1, model_2):\n",
    "    base_learner.fit(X_train, y_train)\n",
    "\n",
    "# Kept as the last expression so the cell still echoes the fitted KNN estimator.\n",
    "model_3.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "cbfd3fe9",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:08.902240Z",
     "iopub.status.busy": "2021-09-19T19:53:08.901254Z",
     "iopub.status.idle": "2021-09-19T19:53:09.956348Z",
     "shell.execute_reply": "2021-09-19T19:53:09.955686Z",
     "shell.execute_reply.started": "2021-09-19T19:37:02.706154Z"
    },
    "papermill": {
     "duration": 1.086808,
     "end_time": "2021-09-19T19:53:09.956506",
     "exception": false,
     "start_time": "2021-09-19T19:53:08.869698",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy.stats import mode\n",
    "\n",
    "# Hard (class label) predictions of each base model, one column per model.\n",
    "preds = np.stack([model_1.predict(X_test),\n",
    "                  model_2.predict(X_test),\n",
    "                  model_3.predict(X_test)]).T\n",
    "\n",
    "# Majority vote per row. Passing axis=1 lets SciPy compute every row's mode in\n",
    "# a single call instead of invoking mode() once per sample through\n",
    "# np.apply_along_axis. np.ravel() makes the result 1-D whatever shape the\n",
    "# installed SciPy version returns for the mode.\n",
    "max_voting = np.ravel(mode(preds, axis=1).mode)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "cd45079d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:10.011723Z",
     "iopub.status.busy": "2021-09-19T19:53:10.011033Z",
     "iopub.status.idle": "2021-09-19T19:53:10.013722Z",
     "shell.execute_reply": "2021-09-19T19:53:10.014233Z",
     "shell.execute_reply.started": "2021-09-19T19:37:03.767203Z"
    },
    "papermill": {
     "duration": 0.033448,
     "end_time": "2021-09-19T19:53:10.014394",
     "exception": false,
     "start_time": "2021-09-19T19:53:09.980946",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.24\n"
     ]
    }
   ],
   "source": [
    "# Share of test rows where the three models do not all predict the same class:\n",
    "# a non-zero spread (max - min) across a row means at least one model disagrees.\n",
    "disagreement = np.ptp(preds, axis=1) > 0\n",
    "discordant = disagreement.sum() / len(y_test)\n",
    "print(f\"{discordant:0.2f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "c6e7ab06",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:10.070976Z",
     "iopub.status.busy": "2021-09-19T19:53:10.068520Z",
     "iopub.status.idle": "2021-09-19T19:53:10.076497Z",
     "shell.execute_reply": "2021-09-19T19:53:10.075983Z",
     "shell.execute_reply.started": "2021-09-19T19:37:03.774166Z"
    },
    "papermill": {
     "duration": 0.037388,
     "end_time": "2021-09-19T19:53:10.076631",
     "exception": false,
     "start_time": "2021-09-19T19:53:10.039243",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy for model SVC is: 0.804\n",
      "Accuracy for model RF  is: 0.793\n",
      "Accuracy for model KNN is: 0.805\n"
     ]
    }
   ],
   "source": [
    "# Test accuracy of every base model; column i of preds belongs to label i.\n",
    "model_labels = ('SVC', 'RF ', 'KNN')\n",
    "for i, model in enumerate(model_labels):\n",
    "    acc = accuracy_score(y_test, preds[:, i])\n",
    "    print(f\"Accuracy for model {model} is: {acc:0.3f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "5088fff6",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:10.132894Z",
     "iopub.status.busy": "2021-09-19T19:53:10.130437Z",
     "iopub.status.idle": "2021-09-19T19:53:10.135142Z",
     "shell.execute_reply": "2021-09-19T19:53:10.135760Z",
     "shell.execute_reply.started": "2021-09-19T19:37:03.791838Z"
    },
    "papermill": {
     "duration": 0.034314,
     "end_time": "2021-09-19T19:53:10.135943",
     "exception": false,
     "start_time": "2021-09-19T19:53:10.101629",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy for majority voting is: 0.817\n"
     ]
    }
   ],
   "source": [
    "# Test accuracy of the majority-vote predictions.\n",
    "max_voting_accuray = accuracy_score(y_test, max_voting)\n",
    "print(f\"Accuracy for majority voting is: {max_voting_accuray:0.3f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "b8769a36",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:10.192984Z",
     "iopub.status.busy": "2021-09-19T19:53:10.192323Z",
     "iopub.status.idle": "2021-09-19T19:53:11.000348Z",
     "shell.execute_reply": "2021-09-19T19:53:11.000905Z",
     "shell.execute_reply.started": "2021-09-19T19:37:03.804667Z"
    },
    "papermill": {
     "duration": 0.839711,
     "end_time": "2021-09-19T19:53:11.001092",
     "exception": false,
     "start_time": "2021-09-19T19:53:10.161381",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Column 1 of predict_proba for each base model on the test split,\n",
    "# arranged with one column per model.\n",
    "proba = np.array([\n",
    "    base.predict_proba(X_test)[:, 1]\n",
    "    for base in (model_1, model_2, model_3)\n",
    "]).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "db12a6a7",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:11.057741Z",
     "iopub.status.busy": "2021-09-19T19:53:11.055316Z",
     "iopub.status.idle": "2021-09-19T19:53:11.068234Z",
     "shell.execute_reply": "2021-09-19T19:53:11.068758Z",
     "shell.execute_reply.started": "2021-09-19T19:37:04.622064Z"
    },
    "papermill": {
     "duration": 0.042392,
     "end_time": "2021-09-19T19:53:11.068932",
     "exception": false,
     "start_time": "2021-09-19T19:53:11.026540",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ROC-AUC for model SVC is: 0.88126\n",
      "ROC-AUC for model RF  is: 0.87685\n",
      "ROC-AUC for model KNN is: 0.87511\n"
     ]
    }
   ],
   "source": [
    "# Test ROC-AUC of every base model, scored on its own column of proba.\n",
    "model_labels = ['SVC', 'RF ', 'KNN']\n",
    "for i, model in enumerate(model_labels):\n",
    "    ras = roc_auc_score(y_test, proba[:, i])\n",
    "    print(f\"ROC-AUC for model {model} is: {ras:0.5f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "5763c515",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:11.124412Z",
     "iopub.status.busy": "2021-09-19T19:53:11.123754Z",
     "iopub.status.idle": "2021-09-19T19:53:11.129682Z",
     "shell.execute_reply": "2021-09-19T19:53:11.130132Z",
     "shell.execute_reply.started": "2021-09-19T19:37:04.636518Z"
    },
    "papermill": {
     "duration": 0.035446,
     "end_time": "2021-09-19T19:53:11.130305",
     "exception": false,
     "start_time": "2021-09-19T19:53:11.094859",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean averaging ROC-AUC is: 0.90192\n"
     ]
    }
   ],
   "source": [
    "# Plain average of the models' probability columns.\n",
    "arithmetic = np.mean(proba, axis=1)\n",
    "ras = roc_auc_score(y_test, arithmetic)\n",
    "print(f\"Mean averaging ROC-AUC is: {ras:0.5f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "ccbed85a",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:11.187806Z",
     "iopub.status.busy": "2021-09-19T19:53:11.187129Z",
     "iopub.status.idle": "2021-09-19T19:53:11.192851Z",
     "shell.execute_reply": "2021-09-19T19:53:11.192314Z",
     "shell.execute_reply.started": "2021-09-19T19:37:04.646724Z"
    },
    "papermill": {
     "duration": 0.036553,
     "end_time": "2021-09-19T19:53:11.192994",
     "exception": false,
     "start_time": "2021-09-19T19:53:11.156441",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Geometric averaging ROC-AUC is: 0.89857\n"
     ]
    }
   ],
   "source": [
    "# Geometric mean of the model probabilities. The root is taken from the number\n",
    "# of columns in proba instead of a hard-coded 3, so the formula stays correct\n",
    "# if models are added to or removed from the ensemble.\n",
    "n_models = proba.shape[1]\n",
    "geometric = proba.prod(axis=1) ** (1 / n_models)\n",
    "ras = roc_auc_score(y_true=y_test, y_score=geometric)\n",
    "print(f\"Geometric averaging ROC-AUC is: {ras:0.5f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "d1f76981",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:11.249592Z",
     "iopub.status.busy": "2021-09-19T19:53:11.248971Z",
     "iopub.status.idle": "2021-09-19T19:53:11.255580Z",
     "shell.execute_reply": "2021-09-19T19:53:11.256053Z",
     "shell.execute_reply.started": "2021-09-19T19:37:04.659367Z"
    },
    "papermill": {
     "duration": 0.03682,
     "end_time": "2021-09-19T19:53:11.256222",
     "exception": false,
     "start_time": "2021-09-19T19:53:11.219402",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Geometric averaging ROC-AUC is: 0.89916\n"
     ]
    }
   ],
   "source": [
    "# Harmonic mean of the model probabilities. The small constant keeps the\n",
    "# reciprocal finite when a model outputs a probability of exactly 0.\n",
    "harmonic = 1 / np.mean(1. / (proba + 0.00001), axis=1)\n",
    "ras = roc_auc_score(y_true=y_test, y_score=harmonic)\n",
    "# The label names the harmonic mean; it previously repeated \"Geometric\".\n",
    "print(f\"Harmonic averaging ROC-AUC is: {ras:0.5f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "63e39bf0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean of powers averaging ROC-AUC is: 0.89996\n"
     ]
    }
   ],
   "source": [
    "# Power mean: raise to the n-th power, average across models, take the n-th root.\n",
    "n = 3\n",
    "powered = np.power(proba, n)\n",
    "mean_of_powers = np.power(powered.mean(axis=1), 1 / n)\n",
    "ras = roc_auc_score(y_test, mean_of_powers)\n",
    "print(f\"Mean of powers averaging ROC-AUC is: {ras:0.5f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "87d398a4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Logarithmic averaging ROC-AUC is: 0.90179\n"
     ]
    }
   ],
   "source": [
    "# Average in log(1 + p) space, then map back with exp(x) - 1.\n",
    "log_space_mean = np.log1p(proba).mean(axis=1)\n",
    "logarithmic = np.expm1(log_space_mean)\n",
    "ras = roc_auc_score(y_test, logarithmic)\n",
    "print(f\"Logarithmic averaging ROC-AUC is: {ras:0.5f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9791814a",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:11.317142Z",
     "iopub.status.busy": "2021-09-19T19:53:11.313830Z",
     "iopub.status.idle": "2021-09-19T19:53:11.323748Z",
     "shell.execute_reply": "2021-09-19T19:53:11.323201Z",
     "shell.execute_reply.started": "2021-09-19T19:37:04.671686Z"
    },
    "papermill": {
     "duration": 0.040622,
     "end_time": "2021-09-19T19:53:11.323885",
     "exception": false,
     "start_time": "2021-09-19T19:53:11.283263",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Weighted averaging ROC-AUC is: 0.90206\n"
     ]
    }
   ],
   "source": [
    "# Weight each model by the inverse of its mean correlation with the others,\n",
    "# so that models whose predictions are less correlated get a larger weight.\n",
    "cormat = np.corrcoef(proba, rowvar=False)\n",
    "np.fill_diagonal(cormat, 0.0)  # drop each model's correlation with itself\n",
    "W = np.reciprocal(cormat.mean(axis=1))\n",
    "W /= W.sum()  # normalizing to sum==1.0\n",
    "weighted = np.dot(proba, W)\n",
    "ras = roc_auc_score(y_test, weighted)\n",
    "print(f\"Weighted averaging ROC-AUC is: {ras:0.5f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "24d1dc68",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:11.384852Z",
     "iopub.status.busy": "2021-09-19T19:53:11.381529Z",
     "iopub.status.idle": "2021-09-19T19:53:11.390582Z",
     "shell.execute_reply": "2021-09-19T19:53:11.391056Z",
     "shell.execute_reply.started": "2021-09-19T19:37:04.686934Z"
    },
    "papermill": {
     "duration": 0.040061,
     "end_time": "2021-09-19T19:53:11.391229",
     "exception": false,
     "start_time": "2021-09-19T19:53:11.351168",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean averaging ROC-AUC is: 0.90180\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "# Min-max rescale every model's probability column, then average the columns.\n",
    "rescaled_proba = MinMaxScaler().fit_transform(proba)\n",
    "arithmetic = rescaled_proba.mean(axis=1)\n",
    "ras = roc_auc_score(y_test, arithmetic)\n",
    "print(f\"Mean averaging ROC-AUC is: {ras:0.5f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "d548eb72",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:11.450375Z",
     "iopub.status.busy": "2021-09-19T19:53:11.449373Z",
     "iopub.status.idle": "2021-09-19T19:53:36.050533Z",
     "shell.execute_reply": "2021-09-19T19:53:36.050008Z",
     "shell.execute_reply.started": "2021-09-19T19:37:04.703326Z"
    },
    "papermill": {
     "duration": 24.631864,
     "end_time": "2021-09-19T19:53:36.050703",
     "exception": false,
     "start_time": "2021-09-19T19:53:11.418839",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "FOLD 0 Mean averaging ROC-AUC is: 0.88202\n",
      "FOLD 1 Mean averaging ROC-AUC is: 0.87379\n",
      "FOLD 2 Mean averaging ROC-AUC is: 0.91092\n",
      "FOLD 3 Mean averaging ROC-AUC is: 0.87909\n",
      "FOLD 4 Mean averaging ROC-AUC is: 0.89224\n",
      "CV Mean averaging ROC-AUC is: 0.88761\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import KFold\n",
    "\n",
    "# 5-fold cross-validated ROC-AUC of the mean-averaging ensemble,\n",
    "# computed on the training split only.\n",
    "kf = KFold(n_splits=5, shuffle=True, random_state=0)\n",
    "scores = []\n",
    "\n",
    "for k, (train_index, test_index) in enumerate(kf.split(X_train)):\n",
    "    X_fit, y_fit = X_train[train_index, :], y_train[train_index]\n",
    "    X_val, y_val = X_train[test_index, :], y_train[test_index]\n",
    "\n",
    "    # Refit all three base models on this fold's training rows.\n",
    "    for base in (model_1, model_2, model_3):\n",
    "        base.fit(X_fit, y_fit)\n",
    "\n",
    "    # Probabilities for this fold's validation rows, one column per model.\n",
    "    proba = np.array([base.predict_proba(X_val)[:, 1]\n",
    "                      for base in (model_1, model_2, model_3)]).T\n",
    "\n",
    "    arithmetic = proba.mean(axis=1)\n",
    "    ras = roc_auc_score(y_val, arithmetic)\n",
    "    scores.append(ras)\n",
    "    print(f\"FOLD {k} Mean averaging ROC-AUC is: {ras:0.5f}\")\n",
    "\n",
    "print(f\"CV Mean averaging ROC-AUC is: {np.mean(scores):0.5f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "36b279fd",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:36.116721Z",
     "iopub.status.busy": "2021-09-19T19:53:36.116042Z",
     "iopub.status.idle": "2021-09-19T19:53:40.575193Z",
     "shell.execute_reply": "2021-09-19T19:53:40.574567Z",
     "shell.execute_reply.started": "2021-09-19T19:37:29.529871Z"
    },
    "papermill": {
     "duration": 4.495479,
     "end_time": "2021-09-19T19:53:40.575333",
     "exception": false,
     "start_time": "2021-09-19T19:53:36.079854",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Carve a 25% holdout out of the training split; the base models are refit on\n",
    "# the remaining blend part and then predict the holdout rows.\n",
    "X_blend, X_holdout, y_blend, y_holdout = train_test_split(\n",
    "    X_train, y_train, test_size=0.25, random_state=0\n",
    ")\n",
    "\n",
    "for base in (model_1, model_2, model_3):\n",
    "    base.fit(X_blend, y_blend)\n",
    "\n",
    "# Holdout probabilities, one column per model.\n",
    "proba = np.array([base.predict_proba(X_holdout)[:, 1]\n",
    "                  for base in (model_1, model_2, model_3)]).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "ff9a3f7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Standardise the holdout probabilities column by column.\n",
    "# The fitted scaler stays available in `scaler`.\n",
    "scaler = StandardScaler().fit(proba)\n",
    "proba = scaler.transform(proba)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "cfc010d9",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:40.638551Z",
     "iopub.status.busy": "2021-09-19T19:53:40.637926Z",
     "iopub.status.idle": "2021-09-19T19:53:40.644748Z",
     "shell.execute_reply": "2021-09-19T19:53:40.644105Z",
     "shell.execute_reply.started": "2021-09-19T19:37:34.042872Z"
    },
    "papermill": {
     "duration": 0.040697,
     "end_time": "2021-09-19T19:53:40.644890",
     "exception": false,
     "start_time": "2021-09-19T19:53:40.604193",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression(solver='liblinear')"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# Linear blender fitted on the base models' holdout predictions.\n",
    "blender = LogisticRegression(solver='liblinear').fit(proba, y_holdout)\n",
    "\n",
    "# Last expression: echo the fitted estimator as the cell output.\n",
    "blender"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "a3857934",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:40.708724Z",
     "iopub.status.busy": "2021-09-19T19:53:40.706100Z",
     "iopub.status.idle": "2021-09-19T19:53:40.712069Z",
     "shell.execute_reply": "2021-09-19T19:53:40.711360Z",
     "shell.execute_reply.started": "2021-09-19T19:37:34.051890Z"
    },
    "papermill": {
     "duration": 0.038045,
     "end_time": "2021-09-19T19:53:40.712221",
     "exception": false,
     "start_time": "2021-09-19T19:53:40.674176",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.78911314 0.47202077 0.75115854]]\n"
     ]
    }
   ],
   "source": [
    "# Coefficients learned by the blender for its input columns.\n",
    "blender_coefficients = blender.coef_\n",
    "print(blender_coefficients)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "0dca82ee",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:40.778198Z",
     "iopub.status.busy": "2021-09-19T19:53:40.777508Z",
     "iopub.status.idle": "2021-09-19T19:53:41.430060Z",
     "shell.execute_reply": "2021-09-19T19:53:41.429513Z",
     "shell.execute_reply.started": "2021-09-19T19:37:34.065878Z"
    },
    "papermill": {
     "duration": 0.688341,
     "end_time": "2021-09-19T19:53:41.430202",
     "exception": false,
     "start_time": "2021-09-19T19:53:40.741861",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ROC-AUC for linear blending KNN is: 0.88621\n"
     ]
    }
   ],
   "source": [
    "# Test-set probabilities of the base models, one column per model.\n",
    "test_proba_raw = np.stack([model_1.predict_proba(X_test)[:, 1],\n",
    "                           model_2.predict_proba(X_test)[:, 1],\n",
    "                           model_3.predict_proba(X_test)[:, 1]]).T\n",
    "\n",
    "# The blender was fitted on probabilities standardised by `scaler`, so the\n",
    "# test probabilities must go through the same fitted scaler before scoring;\n",
    "# feeding raw probabilities would put them on a different scale than the\n",
    "# one the blender's coefficients were learned on.\n",
    "test_proba = scaler.transform(test_proba_raw)\n",
    "\n",
    "blending = blender.predict_proba(test_proba)[:, 1]\n",
    "ras = roc_auc_score(y_true=y_test, y_score=blending)\n",
    "# The label no longer interpolates `model`, a loop variable left over from the\n",
    "# per-model metric cells that made this line print the last model's name.\n",
    "print(f\"ROC-AUC for linear blending is: {ras:0.5f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "689e7d97",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:41.533221Z",
     "iopub.status.busy": "2021-09-19T19:53:41.496952Z",
     "iopub.status.idle": "2021-09-19T19:53:42.424367Z",
     "shell.execute_reply": "2021-09-19T19:53:42.424909Z",
     "shell.execute_reply.started": "2021-09-19T19:37:34.727721Z"
    },
    "papermill": {
     "duration": 0.964892,
     "end_time": "2021-09-19T19:53:42.425083",
     "exception": false,
     "start_time": "2021-09-19T19:53:41.460191",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ROC-AUC for non-linear blending KNN is: 0.83862\n"
     ]
    }
   ],
   "source": [
    "# Non-linear blender fitted on the standardised holdout probabilities.\n",
    "# random_state is fixed so the reported score is reproducible across runs.\n",
    "blender = RandomForestClassifier(random_state=0)\n",
    "blender.fit(proba, y_holdout)\n",
    "\n",
    "# Test-set probabilities of the base models, one column per model.\n",
    "test_proba_raw = np.stack([model_1.predict_proba(X_test)[:, 1],\n",
    "                           model_2.predict_proba(X_test)[:, 1],\n",
    "                           model_3.predict_proba(X_test)[:, 1]]).T\n",
    "\n",
    "# The forest's split thresholds were learned on probabilities standardised by\n",
    "# `scaler`; raw probabilities would be compared against the wrong thresholds,\n",
    "# so the test inputs are passed through the same fitted scaler first.\n",
    "test_proba = scaler.transform(test_proba_raw)\n",
    "\n",
    "blending = blender.predict_proba(test_proba)[:, 1]\n",
    "ras = roc_auc_score(y_true=y_test, y_score=blending)\n",
    "# The label no longer interpolates `model`, a loop variable left over from the\n",
    "# per-model metric cells that made this line print the last model's name.\n",
    "print(f\"ROC-AUC for non-linear blending is: {ras:0.5f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "d483ef3c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:42.494830Z",
     "iopub.status.busy": "2021-09-19T19:53:42.494176Z",
     "iopub.status.idle": "2021-09-19T19:53:45.127241Z",
     "shell.execute_reply": "2021-09-19T19:53:45.126342Z",
     "shell.execute_reply.started": "2021-09-19T19:37:35.641083Z"
    },
    "papermill": {
     "duration": 2.671601,
     "end_time": "2021-09-19T19:53:45.127386",
     "exception": false,
     "start_time": "2021-09-19T19:53:42.455785",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Split the training data in half: the base models are refit on the blend\n",
    "# half and then predict the holdout half.\n",
    "X_blend, X_holdout, y_blend, y_holdout = train_test_split(\n",
    "    X_train, y_train, test_size=0.5, random_state=0\n",
    ")\n",
    "\n",
    "for base in (model_1, model_2, model_3):\n",
    "    base.fit(X_blend, y_blend)\n",
    "\n",
    "# Holdout probabilities, one column per model.\n",
    "proba = np.array([base.predict_proba(X_holdout)[:, 1]\n",
    "                  for base in (model_1, model_2, model_3)]).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "0c5c393c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:45.191275Z",
     "iopub.status.busy": "2021-09-19T19:53:45.190635Z",
     "iopub.status.idle": "2021-09-19T19:53:45.744296Z",
     "shell.execute_reply": "2021-09-19T19:53:45.743734Z",
     "shell.execute_reply.started": "2021-09-19T19:37:38.247690Z"
    },
    "papermill": {
     "duration": 0.586706,
     "end_time": "2021-09-19T19:53:45.744483",
     "exception": false,
     "start_time": "2021-09-19T19:53:45.157777",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "starting baseline is 0.50000\n",
      "Adding model_3 to the ensemble: ROC-AUC increases score to 0.84298\n",
      "Adding model_2 to the ensemble: ROC-AUC increases score to 0.86533\n",
      "Adding model_1 to the ensemble: ROC-AUC increases score to 0.86644\n",
      "Adding model_3 to the ensemble: ROC-AUC increases score to 0.86691\n",
      "Adding model_2 to the ensemble: ROC-AUC increases score to 0.86779\n",
      "Cannot improve furthermore - Stopping\n"
     ]
    }
   ],
   "source": [
    "iterations = 100\n",
    "\n",
    "# Holdout probabilities of the three base models, one column per model.\n",
    "proba = np.array([base.predict_proba(X_holdout)[:, 1]\n",
    "                  for base in (model_1, model_2, model_3)]).T\n",
    "\n",
    "baseline = 0.5\n",
    "print(f\"starting baseline is {baseline:0.5f}\")\n",
    "\n",
    "# Greedy forward selection with replacement: in every round each model is\n",
    "# tentatively appended to the current ensemble, and the one whose addition\n",
    "# gives the best holdout ROC-AUC of the averaged probabilities is kept.\n",
    "models = []\n",
    "\n",
    "for i in range(iterations):\n",
    "    challengers = []\n",
    "    for j in range(proba.shape[1]):\n",
    "        candidate = models + [j]\n",
    "        new_proba = np.stack(proba[:, candidate])\n",
    "        score = roc_auc_score(y_holdout, new_proba.mean(axis=1))\n",
    "        challengers.append([score, j])\n",
    "\n",
    "    # max() returns the first of equally scored challengers (lowest column index).\n",
    "    best_score, best_model = max(challengers, key=lambda pair: pair[0])\n",
    "\n",
    "    # Stop as soon as no challenger beats the current ensemble score.\n",
    "    if not best_score > baseline:\n",
    "        print(\"Cannot improve further - Stopping\")\n",
    "        break\n",
    "\n",
    "    print(f\"Adding model_{best_model+1} to the ensemble\", end=': ')\n",
    "    print(f\"ROC-AUC increases score to {best_score:0.5f}\")\n",
    "    models.append(best_model)\n",
    "    baseline = best_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "b00263bd",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:45.810300Z",
     "iopub.status.busy": "2021-09-19T19:53:45.809596Z",
     "iopub.status.idle": "2021-09-19T19:53:45.814804Z",
     "shell.execute_reply": "2021-09-19T19:53:45.815353Z",
     "shell.execute_reply.started": "2021-09-19T19:37:38.801526Z"
    },
    "papermill": {
     "duration": 0.039426,
     "end_time": "2021-09-19T19:53:45.815519",
     "exception": false,
     "start_time": "2021-09-19T19:53:45.776093",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{2: 0.4, 1: 0.4, 0: 0.2}\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "\n",
    "# Turn how often each model was selected into a weight (share of all picks).\n",
    "n_picks = len(models)\n",
    "freqs = Counter(models)\n",
    "weights = {}\n",
    "for key, freq in freqs.items():\n",
    "    weights[key] = freq / n_picks\n",
    "print(weights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "6e9d0916",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:53:45.887399Z",
     "iopub.status.busy": "2021-09-19T19:53:45.886745Z",
     "iopub.status.idle": "2021-09-19T19:54:10.573509Z",
     "shell.execute_reply": "2021-09-19T19:54:10.572880Z",
     "shell.execute_reply.started": "2021-09-19T19:37:38.808131Z"
    },
    "papermill": {
     "duration": 24.726948,
     "end_time": "2021-09-19T19:54:10.573682",
     "exception": false,
     "start_time": "2021-09-19T19:53:45.846734",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import KFold\n",
    "\n",
    "kf = KFold(n_splits=5, shuffle=True, random_state=0)\n",
    "scores = []\n",
    "\n",
    "# One column per base model. first_lvl_oof is filled fold by fold below;\n",
    "# fist_lvl_preds is only allocated in this cell.\n",
    "first_lvl_oof = np.zeros((len(X_train), 3))\n",
    "fist_lvl_preds = np.zeros((len(X_test), 3))\n",
    "\n",
    "for k, (train_index, val_index) in enumerate(kf.split(X_train)):\n",
    "    X_fit, y_fit = X_train[train_index, :], y_train[train_index]\n",
    "    X_val = X_train[val_index, :]\n",
    "\n",
    "    # Fit each model on the fold's training rows and store its predictions\n",
    "    # for the rows it has not seen (the fold's validation rows).\n",
    "    for column, base in enumerate((model_1, model_2, model_3)):\n",
    "        base.fit(X_fit, y_fit)\n",
    "        first_lvl_oof[val_index, column] = base.predict_proba(X_val)[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "337c5322",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:54:10.643901Z",
     "iopub.status.busy": "2021-09-19T19:54:10.643251Z",
     "iopub.status.idle": "2021-09-19T19:54:18.328525Z",
     "shell.execute_reply": "2021-09-19T19:54:18.327989Z",
     "shell.execute_reply.started": "2021-09-19T19:38:03.642445Z"
    },
    "papermill": {
     "duration": 7.723602,
     "end_time": "2021-09-19T19:54:18.328687",
     "exception": false,
     "start_time": "2021-09-19T19:54:10.605085",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Refit each base model on the whole training set, then write the second\n",
    "# predict_proba column for the test rows into that model's column.\n",
    "for col, estimator in enumerate((model_1, model_2, model_3)):\n",
    "    estimator.fit(X_train, y_train)\n",
    "    fist_lvl_preds[:, col] = estimator.predict_proba(X_test)[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "1cdd4d8d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:54:18.399091Z",
     "iopub.status.busy": "2021-09-19T19:54:18.398383Z",
     "iopub.status.idle": "2021-09-19T19:54:43.019163Z",
     "shell.execute_reply": "2021-09-19T19:54:43.018593Z",
     "shell.execute_reply.started": "2021-09-19T19:38:11.321141Z"
    },
    "papermill": {
     "duration": 24.659889,
     "end_time": "2021-09-19T19:54:43.019324",
     "exception": false,
     "start_time": "2021-09-19T19:54:18.359435",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "second_lvl_oof = np.zeros((len(X_train), 3))\n",
    "second_lvl_preds = np.zeros((len(X_test), 3))\n",
    "\n",
    "# Skip-connection features: the original training columns followed by the\n",
    "# first-level OOF columns. This matrix does not depend on the fold, so it is\n",
    "# built once here instead of being rebuilt on every split.\n",
    "skip_X_train = np.hstack([X_train, first_lvl_oof])\n",
    "\n",
    "for train_index, val_index in kf.split(X_train):\n",
    "    # Same fitting order as before (model_1, model_2, model_3) within each fold.\n",
    "    for col, model in enumerate((model_1, model_2, model_3)):\n",
    "        model.fit(skip_X_train[train_index, :], y_train[train_index])\n",
    "        # Store the second predict_proba column for the held-out rows only.\n",
    "        second_lvl_oof[val_index, col] = model.predict_proba(skip_X_train[val_index, :])[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "e9f454b8",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:54:43.087800Z",
     "iopub.status.busy": "2021-09-19T19:54:43.087125Z",
     "iopub.status.idle": "2021-09-19T19:54:50.759678Z",
     "shell.execute_reply": "2021-09-19T19:54:50.760214Z",
     "shell.execute_reply.started": "2021-09-19T19:38:36.043148Z"
    },
    "papermill": {
     "duration": 7.710139,
     "end_time": "2021-09-19T19:54:50.760457",
     "exception": false,
     "start_time": "2021-09-19T19:54:43.050318",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Test-side counterpart of skip_X_train: the original test columns followed by\n",
    "# the first-level test predictions.\n",
    "skip_X_test = np.hstack((X_test, fist_lvl_preds))\n",
    "\n",
    "# Refit each model on the full augmented training matrix and fill its column\n",
    "# with the second predict_proba column for the augmented test rows.\n",
    "for col, estimator in enumerate((model_1, model_2, model_3)):\n",
    "    estimator.fit(skip_X_train, y_train)\n",
    "    second_lvl_preds[:, col] = estimator.predict_proba(skip_X_test)[:, 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "c140496e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-09-19T19:54:50.832068Z",
     "iopub.status.busy": "2021-09-19T19:54:50.831395Z",
     "iopub.status.idle": "2021-09-19T19:54:50.837431Z",
     "shell.execute_reply": "2021-09-19T19:54:50.836918Z",
     "shell.execute_reply.started": "2021-09-19T19:38:43.714635Z"
    },
    "papermill": {
     "duration": 0.04513,
     "end_time": "2021-09-19T19:54:50.837572",
     "exception": false,
     "start_time": "2021-09-19T19:54:50.792442",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stacking ROC-AUC is: 0.90424\n"
     ]
    }
   ],
   "source": [
    "# Blend the three second-level columns with a plain row-wise average, then\n",
    "# score the blend on the test labels and record the result.\n",
    "arithmetic = np.mean(second_lvl_preds, axis=1)\n",
    "ras = roc_auc_score(y_test, arithmetic)\n",
    "scores.append(ras)\n",
    "print(f\"Stacking ROC-AUC is: {ras:0.5f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.9"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 119.797966,
   "end_time": "2021-09-19T19:54:52.277615",
   "environment_variables": {},
   "exception": null,
   "input_path": "__notebook__.ipynb",
   "output_path": "__notebook__.ipynb",
   "parameters": {},
   "start_time": "2021-09-19T19:52:52.479649",
   "version": "2.3.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
